Chapter 3 Data statistics

load("data/data.Rdata")

3.1 Sequencing reads statistics

sample_metadata %>% 
    summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2), 
              mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
              sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
    unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
    tt()
tinytable_n7tgboikw6miehekdlm2
Total Average
171.92 4.91 ± 1.46

3.2 DNA fractions

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(sample_metadata, by = join_by(sample == sample)) %>%
    select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
    mutate(mags_bases = mags*146) %>%
    mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
    mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
    select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()
tinytable_y5x5kypkabcgu5cym5ba
Sample Low quality Mapped to host Unmapped Mapped to MAGs
EHI01966 0.3568965 0.005030679 1.9178067 2.507921
EHI01967 0.2692620 0.000709418 0.8339877 4.617569
EHI01968 0.3467658 0.000167184 1.2259195 5.669136
EHI01970 0.2568282 0.000235551 1.1356328 3.939774
EHI01979 0.3068304 0.000429188 1.1643291 3.963429
EHI01982 0.2321305 0.003106822 1.2765585 2.120459
EHI01985 0.3350121 0.000294977 1.3705302 4.091743
EHI01989 0.3249292 0.000570792 1.2444290 3.406020
EHI01992 0.2112149 0.000650860 1.3810604 2.863645
EHI01995 0.2441431 0.001014484 1.0559948 2.625781
EHI02065 0.2881934 0.000248787 0.9495023 3.524029
EHI02067 0.4887928 0.009035522 1.5062498 3.314287
EHI02079 0.4087200 0.001541555 1.0763461 3.216692
EHI02088 0.5380240 0.002508850 1.7069668 4.880649
EHI02097 1.1086051 0.001111047 1.1464450 4.177690
EHI02105 0.4367774 0.006671326 0.7868596 1.606939
EHI02582 0.1565447 0.000041661 0.7099037 3.061117
EHI02584 0.2131393 0.000243149 1.3719978 3.909083
EHI02585 0.3055922 0.000693419 1.3837783 4.156742
EHI02587 0.2002879 0.001053113 1.1718219 3.773711
EHI02592 0.2204754 0.000807086 1.0309674 3.998601
EHI02598 0.1922634 0.000085434 1.1517125 3.984019
EHI02602 0.1798841 0.000148220 1.2512663 2.802077
EHI02603 0.2165665 0.002208379 1.3526444 3.734602
EHI02607 0.2170496 0.000239951 1.5274046 3.978722
EHI02612 0.1376030 0.000170810 0.6921896 2.540514
EHI02615 0.1893992 0.000351599 0.6650246 1.618650
EHI02617 0.1957782 0.000243466 1.3981847 2.879747
EHI02619 0.2292718 0.000086283 0.9636172 3.865591
EHI02624 0.2077297 0.000351284 1.0437963 3.443889
EHI02625 0.1169758 0.000023503 0.5396744 1.996437
EHI02630 0.1994379 0.000258866 0.9279985 3.240507
EHI02632 0.3725667 0.000748839 1.4198031 2.929934
EHI02633 0.2780827 0.000903128 1.3810886 5.477885
EHI02639 0.4685676 0.001126284 2.1791314 8.022893
sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
  tt()
tinytable_yqak7pxkszyhh78u1h9m
Low quality Mapped to host Unmapped Mapped to MAGs
0.2985812 0.001231758 1.198304 3.598299
sequence_fractions %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
  
    ggplot(., aes(x = sample, y = value, fill=fraction)) +
        geom_bar(position="stack", stat = "identity") +
      scale_fill_manual(name="Sequence type",
                    breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
                    labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                    values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
        labs(x = "Samples", y = "Amount of data (GB)") +
        theme_classic() +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

3.3 Recovered microbial fraction

singlem_table <- sequence_fractions %>%
    mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
    left_join(sample_metadata, by = join_by(sample == sample))  %>%
    mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
    select(sample,mags_proportion,singlem_proportion) %>%
    mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
    mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify

singlem_table %>%
    pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
    mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
    ggplot(., aes(x = value, y = sample, color=proportion)) +
            geom_line(aes(group = sample), color = "#f8a538") +
            geom_point() +
      scale_color_manual(name="Proportion",
                    breaks=c("mags_proportion","singlem_proportion"),
                    labels=c("Recovered","Estimated"),
                    values=c("#52e1e8", "#876b53"))+
            theme_classic() +
            labs(y = "Samples", x = "Prokaryotic fraction (%)") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")

damr <- singlem_table %>%
  mutate(damr=ifelse(is.na(singlem_proportion),100,mags_proportion/singlem_proportion*100)) %>%
  select(sample,damr)

damr %>% 
  tt()
tinytable_vq6e5ik22s8zzdac1tjw
sample damr
EHI01966 100.00000
EHI01967 100.00000
EHI01968 100.00000
EHI01970 100.00000
EHI01979 100.00000
EHI01982 100.00000
EHI01985 100.00000
EHI01989 100.00000
EHI01992 100.00000
EHI01995 100.00000
EHI02065 100.00000
EHI02067 100.00000
EHI02079 100.00000
EHI02088 100.00000
EHI02097 100.00000
EHI02105 100.00000
EHI02582 100.00000
EHI02584 99.78431
EHI02585 100.00000
EHI02587 100.00000
EHI02592 100.00000
EHI02598 100.00000
EHI02602 100.00000
EHI02603 100.00000
EHI02607 100.00000
EHI02612 100.00000
EHI02615 100.00000
EHI02617 100.00000
EHI02619 100.00000
EHI02624 100.00000
EHI02625 100.00000
EHI02630 100.00000
EHI02632 100.00000
EHI02633 100.00000
EHI02639 100.00000
damr %>% 
  filter(damr>95) %>%
  nrow()
[1] 35
damr %>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
tinytable_1iknus30h3bv3n4b7s4n
mean sd
99.99384 0.03645853